#!/bin/bash 
#
# Tesseract OCR batchrun
#
# this script uses the xdialog GUI
#
# 2010-03-13 JvO, new script
# 2011-11-30 JvO, adapted for scanning booklets (rotate and split scan)
#

clear

#
# Scan geometry (pixels)
# TODO get the picture info with Netpbm pamfile
#
## values for a 300dpi scan, B/W (1bit)
original_width=1416
original_height=2854

## half of each original dimension (integer division)
height=$(( original_height / 2 ))
width=$(( original_width / 2 ))

#
# file names used further down
#
tempTIFF1="temp1.tif"
tempTIFF2="temp2.tif"
resultText="OCRText.txt"   # the resulting text outputfile

#
# choose the directory with the images to be scanned
#
# The directory selector prints the chosen path on stdout (--stdout), which is
# captured here. A non-zero exit status or an empty result means no usable
# directory was returned (e.g. the dialog was cancelled or could not be run).
# Stop in that case: continuing would truncate the result file in, and process
# the images of, whatever directory the script was started from.
if ! inputdir=$(xdialog --stdout --dselect \
            ~/Desktop \
            24 80) || [[ -z "${inputdir}" ]]; then
   echo "No directory selected, nothing to do." >&2
   exit 1
fi

#
# show settings
#
echo "Directory : $inputdir";
echo "Imagewidth: $width";
echo "    Height: $height";

#
# process all (TIFF) images in the selected directory
#
# All relative file names below are resolved against the selected directory,
# so a failed cd must abort the run.
if ! cd "${inputdir}"; then
   echo "Cannot change to directory: ${inputdir}" >&2
   exit 1
fi

# start with a fresh result file (it holds a single empty line)
echo "" > "${resultText}";

# Loop over the TIFF files of the current directory.
# A glob is used instead of parsing `ls` output so that file names containing
# whitespace stay intact.
for file in *.tif
do
   ## no match at all: the pattern is left unexpanded, skip it
   [[ -e "${file}" ]] || continue

   ## skip this script's own temporary TIFFs.
   ## (The former filter grep '^[^temp]' was a bracket expression and dropped
   ## every file whose name starts with t, e, m or p.)
   if [[ "${file}" == "${tempTIFF1}" || "${file}" == "${tempTIFF2}" ]]; then
      continue
   fi

   ## the file being processed, without its .tif extension
   filename=${file%.tif}
   echo "Filename: $filename";

   ## show image info
   ##echo `tifftopnm < $file | pamfile;`;

   #
   # preprocess TIFF image
   # modify this to your needs
   # Netpbm programs' overview: http://netpbm.sourceforge.net/doc/directory.html#converters

   # rotate and split the imagefile:
   # TIFF -> PNM, threshold to black/white, rotate, then dice into tiles of
   # $height x $original_width pixels written as ${filename}_<row>_<col>.pbm
   tifftopnm < "${file}" | \
   pamditherbw -threshold -value 0.8 | \
   pamtopnm | \
   pamflip -rotate270 |
   pamdice -width="${height}" -height="${original_width}" -outstem="${filename}";

   # convert split PBM images to TIFF
   # (the two tiles are expected to be the two halves of the rotated scan)
   pnmtotiff "${filename}_0_0.pbm" > "${tempTIFF1}";
   pnmtotiff "${filename}_0_1.pbm" > "${tempTIFF2}";

   # and finally perform the Optical character recognition
   # Usage:tesseract imagename outputbase [-l lang] [configfile [[+|-]varfile]...]
   tesseract "${tempTIFF1}" "${filename}.1" -l nld;
   tesseract "${tempTIFF2}" "${filename}.2" -l nld;

   # append text to result textfile
   cat "${filename}.1.txt" >> "${resultText}";
   cat "${filename}.2.txt" >> "${resultText}";

   # cleanup of the per-image intermediate files
   rm -- "${filename}_0_0.pbm";
   rm -- "${filename}_0_1.pbm";
   rm -- "${filename}.1.txt";
   rm -- "${filename}.2.txt";

done

## cleanup
# Remove the temporary TIFFs.
# -f: they do not exist when no image was processed, which is not an error.
rm -f -- "${tempTIFF1}" "${tempTIFF2}";

echo "==========";
echo "OCR Ready!";
echo "==========";

exit 0;